# -*- coding: utf-8 -*-
"""SPAP_2021_LSQ_TableS2_Other.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1XdKFE4Hs1M5XB11liIAtFFc42pXrlrSZ

**1. Install packages and mount drive**
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score

# Mount Google Drive at /content/drive so the data and embedding files
# referenced below can be read. `google.colab` is provided by the Colab
# runtime.
from google.colab import drive
drive.mount("/content/drive")

"""**2. Load data**"""

# Read the labelled training file from the mounted Drive folder.
data_path = '/content/drive/MyDrive/spap_state/spap_state_attention/data/'
df = pd.read_csv(data_path + 'spap_state_attention_supplementary_train.csv')

# Model input text: the row's own text, with the quoted status text appended
# (space-separated) whenever a quoted status is present.
own_text = df['full_text']
quoted_text = df['quoted_status.full_text']
has_no_quote = quoted_text.isnull()
df['text'] = np.where(has_no_quote, own_text, own_text + " " + quoted_text)

# Target label as float, then keep only the two columns used downstream.
df['covid'] = df['final_label'].astype(float)
df = df[['text', 'covid']]

# Sample size and class balance.
print(len(df))
print(df['covid'].value_counts(normalize = True))

"""**3. Performance metrics**"""

def report_results(A, B):
    """Score predictions ``A`` against reference labels ``B``.

    The two inputs are paired in a temporary frame and any pair with a
    missing value on either side is discarded before scoring.

    Returns:
        list: ``[accuracy, precision, recall, f1]``, in that order.
    """
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    predicted = paired['A']
    reference = paired['B']

    # Each sklearn metric takes (y_true, y_pred); the tuple order fixes the
    # order of the returned list.
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    return [metric(reference, predicted) for metric in metrics]

"""**4. 5-fold cross validation for random forest models**"""

# count vector
# Random forest on binary unigram + bigram bag-of-words features, scored with
# shuffled 5-fold cross-validation.
# NOTE(review): the vectorizer is fitted on all rows before the folds are
# split, so every fold's vocabulary also covers its validation texts.

count = CountVectorizer(ngram_range=(1, 2), binary=True, lowercase=True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

    train_X = train_val_X[train_index, :]
    val_X = train_val_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = train_val_y.iloc[train_index]
    val_y = train_val_y.iloc[val_index]

    # this is where you can change hyper-parameters for random forest
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    rf.fit(train_X, train_y)
    pred_y = rf.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    rf_performance = report_results(pred_y, val_y)
    cv_acc.append(rf_performance[0])
    cv_pre.append(rf_performance[1])
    cv_rec.append(rf_performance[2])
    cv_f1.append(rf_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nRF + count')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))

# TFIDF vector
# Random forest on unigram + bigram TF-IDF features, scored with shuffled
# 5-fold cross-validation.
# NOTE(review): the vectorizer is fitted on all rows before the folds are
# split, so every fold's vocabulary and IDF weights also cover its validation
# texts.

tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

    train_X = train_val_X[train_index, :]
    val_X = train_val_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = train_val_y.iloc[train_index]
    val_y = train_val_y.iloc[val_index]

    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    rf.fit(train_X, train_y)
    pred_y = rf.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    rf_performance = report_results(pred_y, val_y)
    cv_acc.append(rf_performance[0])
    cv_pre.append(rf_performance[1])
    cv_rec.append(rf_performance[2])
    cv_f1.append(rf_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nRF + tfidf')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))

# glove
# Fetch the NLTK tokenizer/stop-word resources and load the pre-trained GloVe
# vectors into memory.

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Maps each token in the embedding file to its float32 vector.
embeddings_index = {}
# your path for glove.6B.200d.txt
# A context manager closes the file even if reading or parsing raises.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Deliberately skip lines whose remaining fields are not all
            # numeric.
            continue
        embeddings_index[word] = coefs

def sent2vec(s, dim=200):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are in the module-level ``stop_words``, are not purely alphabetic, or are
    missing from the module-level ``embeddings_index`` are ignored.

    Args:
        s: Text to embed; it is passed through ``str()`` first, so non-string
            values (e.g. NaN) are accepted.
        dim: Length of the zero vector returned when nothing can be embedded.
            It should equal the dimensionality of the loaded embeddings
            (200 for ``glove.6B.200d``).

    Returns:
        numpy.ndarray: The unit-length sum of the word vectors, or
        ``np.zeros(dim)`` when no token has an embedding or the summed vector
        has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stop_words and w.isalpha() and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would fill the vector with NaN, which the
        # downstream classifiers cannot take as input.
        return np.zeros(dim)
    return v / norm

# Random forest on sentence-level GloVe features (one sent2vec vector per
# text), scored with shuffled 5-fold cross-validation.
glove_X = np.array([sent2vec(x) for x in tqdm(df["text"])])
y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

# Split on the GloVe matrix itself rather than on a feature matrix left over
# from an earlier cell, so this cell does not depend on that cell having run.
for train_index, val_index in kf.split(glove_X):
    train_X = glove_X[train_index, :]
    val_X = glove_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = y.iloc[train_index]
    val_y = y.iloc[val_index]

    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    rf.fit(train_X, train_y)
    pred_y = rf.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    rf_performance = report_results(pred_y, val_y)
    cv_acc.append(rf_performance[0])
    cv_pre.append(rf_performance[1])
    cv_rec.append(rf_performance[2])
    cv_f1.append(rf_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nRF + glove 200d')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))

"""**4. 5-fold cross validation for XGBoost models**"""

# count vector
# XGBoost on binary unigram + bigram bag-of-words features, scored with
# shuffled 5-fold cross-validation.
# NOTE(review): the vectorizer is fitted on all rows before the folds are
# split, so every fold's vocabulary also covers its validation texts.

count = CountVectorizer(ngram_range=(1, 2), binary=True, lowercase=True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

    train_X = train_val_X[train_index, :]
    val_X = train_val_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = train_val_y.iloc[train_index]
    val_y = train_val_y.iloc[val_index]

    xgb = XGBClassifier()
    xgb.fit(train_X, train_y)
    pred_y = xgb.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    xgb_performance = report_results(pred_y, val_y)
    cv_acc.append(xgb_performance[0])
    cv_pre.append(xgb_performance[1])
    cv_rec.append(xgb_performance[2])
    cv_f1.append(xgb_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nXGB + count')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))

# TFIDF vector
# XGBoost on unigram + bigram TF-IDF features, scored with shuffled 5-fold
# cross-validation.
# NOTE(review): the vectorizer is fitted on all rows before the folds are
# split, so every fold's vocabulary and IDF weights also cover its validation
# texts.

tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

    train_X = train_val_X[train_index, :]
    val_X = train_val_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = train_val_y.iloc[train_index]
    val_y = train_val_y.iloc[val_index]

    # this is where you can change hyper-parameters for XGBoost
    xgb = XGBClassifier()
    xgb.fit(train_X, train_y)
    pred_y = xgb.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    xgb_performance = report_results(pred_y, val_y)
    cv_acc.append(xgb_performance[0])
    cv_pre.append(xgb_performance[1])
    cv_rec.append(xgb_performance[2])
    cv_f1.append(xgb_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nXGB + tfidf')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))

# glove
# Fetch the NLTK tokenizer/stop-word resources and load the pre-trained GloVe
# vectors into memory. This repeats the loading done in the random forest
# section.

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Maps each token in the embedding file to its float32 vector.
embeddings_index = {}
# your path for glove.6B.200d.txt
# A context manager closes the file even if reading or parsing raises.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Deliberately skip lines whose remaining fields are not all
            # numeric.
            continue
        embeddings_index[word] = coefs


def sent2vec(s, dim=200):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are in the module-level ``stop_words``, are not purely alphabetic, or are
    missing from the module-level ``embeddings_index`` are ignored.

    Args:
        s: Text to embed; it is passed through ``str()`` first, so non-string
            values (e.g. NaN) are accepted.
        dim: Length of the zero vector returned when nothing can be embedded.
            It should equal the dimensionality of the loaded embeddings
            (200 for ``glove.6B.200d``).

    Returns:
        numpy.ndarray: The unit-length sum of the word vectors, or
        ``np.zeros(dim)`` when no token has an embedding or the summed vector
        has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stop_words and w.isalpha() and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would fill the vector with NaN, which the
        # downstream classifiers cannot take as input.
        return np.zeros(dim)
    return v / norm


# XGBoost on sentence-level GloVe features (one sent2vec vector per text),
# scored with shuffled 5-fold cross-validation.
glove_X = np.array([sent2vec(x) for x in tqdm(df["text"])])
y = df['covid']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# Per-fold scores.
cv_acc = []
cv_pre = []
cv_rec = []
cv_f1 = []

# Split on the GloVe matrix itself rather than on a feature matrix left over
# from an earlier cell, so this cell does not depend on that cell having run.
for train_index, val_index in kf.split(glove_X):
    train_X = glove_X[train_index, :]
    val_X = glove_X[val_index, :]

    # KFold yields positional indices. Plain [] on a Series looks up index
    # labels, which only coincides with position while the index is the
    # default 0..n-1 range, so select by position explicitly.
    train_y = y.iloc[train_index]
    val_y = y.iloc[val_index]

    xgb = XGBClassifier()
    xgb.fit(train_X, train_y)
    pred_y = xgb.predict(val_X)

    # report_results returns [accuracy, precision, recall, f1].
    xgb_performance = report_results(pred_y, val_y)
    cv_acc.append(xgb_performance[0])
    cv_pre.append(xgb_performance[1])
    cv_rec.append(xgb_performance[2])
    cv_f1.append(xgb_performance[3])

# Mean and standard deviation of each metric across the folds.
print('\nXGB + glove 200d')
print('- acc:', round(np.mean(cv_acc), 4), round(np.std(cv_acc), 4))
print('- pre:', round(np.mean(cv_pre), 4), round(np.std(cv_pre), 4))
print('- rec:', round(np.mean(cv_rec), 4), round(np.std(cv_rec), 4))
print('- f1:', round(np.mean(cv_f1), 4), round(np.std(cv_f1), 4))
